# Copy the run parameters into plain variables, then drop `params` so that
# subsequent parameterized calls can be made.
cont <- params$controlCellLine
exp <- params$experimentalCellLine
countmatrix.all <- params$countmatrix.all
metadata.all <- params$metadata.all
rm(params)

# Keep only the samples whose CellLine is the control or the experimental line.
metadata.pair <- filter(
  as.data.frame(metadata.all),
  CellLine == cont | CellLine == exp
)
metadata.pair

# Restrict the count matrix to the columns named by the retained samples'
# ShortName values, and display it as a data frame.
countmatrix.pair <- countmatrix.all[, metadata.pair$ShortName]
as.data.frame(countmatrix.pair)
Run DESeq2 on the data set.
# Reuse the dds that was already fitted instead of re-running it (recent
# changes are all after this point). The loaded file is expected to provide
# `dds.pair`, which is used just below.
load(str_interp("Rdata/${exp}_vs_${cont}_dds.RData"))

# Experimental-vs-control contrast on CellLine, ordered by ascending
# log2 fold change.
res <- results(
  dds.pair,
  contrast = c("CellLine", exp, cont),
  alpha = 0.05
)
res <- res[order(res$log2FoldChange), ]

# Export the complete results table.
outFile <- str_interp("output/${exp}_vs_${cont}_deseq_results.csv")
write.csv(as.data.frame(res), file = outFile)
Filter res for padj < 0.05
# Keep only the genes whose adjusted p-value is below 0.05.
res.filtered <- filter(as.data.frame(res), padj < 0.05)
# filter(log2FoldChange >= 1.5 | log2FoldChange <= -1.5)

# Largest log2 fold change first, then display the table.
res.filtered <- res.filtered[
  order(res.filtered$log2FoldChange, decreasing = TRUE),
]
res.filtered
# Every gene with a positive log2 fold change (no significance filter),
# largest fold change first.
up.unfiltered <- subset(res, log2FoldChange > 0)
up.unfiltered <- up.unfiltered[
  order(up.unfiltered$log2FoldChange, decreasing = TRUE),
]

# Export and display only the fold-change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_upregulated_genes.csv")
write.csv(
  up.unfiltered[, c("log2FoldChange", "padj")],
  file = outFile
)
up.unfiltered[, c("log2FoldChange", "padj")]
log2 fold change (MLE): CellLine OVCAR4A vs OVCAR4
DataFrame with 10826 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
LOC102723570 9.67777 1.10711e-05
HNRNPUL2-BSCL2 9.32131 6.13918e-02
LOC102724441 8.97752 8.72190e-02
RNU1-28P 8.89543 9.20413e-02
LY75-CD302 8.63483 1.04412e-01
... ... ...
SART3 0.000489767 0.999027
ILF2 0.000415536 0.999335
CIPC 0.000294212 0.999445
DYNC1LI1 0.000246635 0.999353
SLC25A17 0.000139774 0.999676
# Significantly upregulated genes: positive log2 fold change among the
# padj-filtered results, largest fold change first.
up <- subset(res.filtered, log2FoldChange > 0)
up <- up[order(up$log2FoldChange, decreasing = TRUE), ]

# Export and print only the fold-change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_genes.csv")
write.csv(
  up[, c("log2FoldChange", "padj")],
  file = outFile
)
print(up[, c("log2FoldChange", "padj")])
# Every gene with a negative log2 fold change (no significance filter),
# most negative fold change first.
down.unfiltered <- subset(res, log2FoldChange < 0)
down.unfiltered <- down.unfiltered[
  order(down.unfiltered$log2FoldChange, decreasing = FALSE),
]

# Export and print only the fold-change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_downregulated_genes.csv")
write.csv(
  down.unfiltered[, c("log2FoldChange", "padj")],
  file = outFile
)
print(down.unfiltered[, c("log2FoldChange", "padj")])
log2 fold change (MLE): CellLine OVCAR4A vs OVCAR4
DataFrame with 10125 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
RNU1-4 -22.09919 6.48243e-07
SAMD5 -10.92849 5.75195e-17
LOC102724219 -10.86442 3.10874e-02
SEMA6A -9.06626 1.67112e-45
TPTE -8.93871 5.17705e-10
... ... ...
PRR12 -0.000682375 0.998978
SLED1 -0.000514870 0.999694
PSMC5 -0.000461366 0.999027
AP4M1 -0.000421076 0.999169
TAF4 -0.000298641 0.999335
# Significantly downregulated genes: negative log2 fold change among the
# padj-filtered results.
down <- subset(res.filtered, log2FoldChange < 0)
# Sort ascending so the most strongly downregulated genes come first, matching
# the ordering used for `down.unfiltered`. (This previously sorted descending,
# which put the weakest changes at the top of the table.)
down <- down[order(down$log2FoldChange, decreasing = FALSE), ]

# Export and print only the fold-change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_genes.csv")
write.csv(down[, c("log2FoldChange", "padj")], file = outFile)
print(down[, c("log2FoldChange", "padj")])
# Volcano plot of every gene: log2 fold change against -log10(adjusted
# p-value), with each point labelled by its row name in `res`.
ggplot(
  as.data.frame(res),
  aes(x = log2FoldChange, y = -log10(padj), label = rownames(res))
) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red")) +
  geom_text_repel() +
  # 1.301 is approximately -log10(0.05).
  geom_hline(yintercept = 1.301) +
  # Vertical reference lines at log2 fold changes of +1.2 and -1.2.
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  xlim(-10, 10)
Warning: Removed 14357 rows containing missing values (`geom_point()`).
Warning: Removed 14357 rows containing missing values (`geom_text_repel()`).
Warning: ggrepel: 15370 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Perform gene set enrichment analysis using clusterProfiler. This gives us the GO pathways that are significantly regulated, based on the log2 fold change in expression of individual genes.
Using a p-value cutoff of 0.05.
# Ranked gene list for GSEA: log2 fold changes named by gene (row names of
# `res`), sorted from largest to smallest.
gene_list <- sort(
  setNames(res$log2FoldChange, rownames(res)),
  decreasing = TRUE
)

# Set the seed so our results are reproducible:
set.seed(2023)
gsea_res <- gseGO(
  gene_list,
  ont = "BP",
  OrgDb = "org.Hs.eg.db",
  keyType = "SYMBOL",
  seed = TRUE,
  pvalueCutoff = 0.05
)
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (14.85% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize = minSize, : For some
pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for
better estimation.
leading edge analysis...
done...
# Format output
gsea_res_df <- as.data.frame(gsea_res)

# Remember each pathway's row position before sorting, so that rows can be
# mapped back to `gsea_res` later.
gsea_res_df$original_row_num <- seq_len(nrow(gsea_res_df))

# Highest NES first; use the pathway ID column as the row names.
gsea_res_df <- gsea_res_df[order(gsea_res_df$NES, decreasing = TRUE), ]
rownames(gsea_res_df) <- gsea_res_df$ID
NES is the normalized enrichment score.
# Compact view of the GSEA table: the key statistics and the description, plus
# the core enrichment genes under a more descriptive column name.
gsea_res_df_short <- gsea_res_df[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short$core_enrichment_genes <- gsea_res_df$core_enrichment

# Pathways with a non-negative NES are reported as upregulated.
gsea_res_df_short.up <- subset(gsea_res_df_short, NES >= 0)

# Export the table, then display it.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up, file = outFile)
gsea_res_df_short.up
GSEA plot of the five most upregulated pathways (or least downregulated)
# Number of pathways to plot: at most five, fewer if fewer were returned.
maxIndex <- min(5, nrow(gsea_res_df))

# `gsea_res_df` is sorted by decreasing NES, so its first rows are the most
# upregulated pathways; `original_row_num` maps them back to rows of
# `gsea_res`. head() is used rather than 1:maxIndex because 1:0 evaluates to
# c(1, 0) when no pathways were returned.
top5PathwaysIds <- head(gsea_res_df[, "original_row_num"], maxIndex)

# Only plot when there is at least one pathway to show.
if (maxIndex > 0) {
  gseaplot2(gsea_res, geneSetID = top5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
} else {
  print("There is no GSEA plot here because no enriched pathways were found")
}
Volcano Plot (Average NES & adjusted p value)
# Volcano plot of the upregulated pathways: NES against -log10(adjusted
# p-value), with each point labelled by its row name.
ggplot(
  as.data.frame(gsea_res_df_short.up),
  aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.up))
) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red")) +
  geom_text_repel() +
  # 1.301 is approximately -log10(0.05).
  geom_hline(yintercept = 1.301) +
  # Vertical reference lines at NES values of +1.2 and -1.2.
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  xlim(-10, 10)
Warning: ggrepel: 223 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
# Pathways with a non-positive NES are reported as downregulated.
gsea_res_df_short.down <- subset(gsea_res_df_short, NES <= 0)

# Export the table, then display it.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down, file = outFile)
gsea_res_df_short.down
GSEA plot of the five most downregulated pathways (or least upregulated)
# First row of the bottom-five window. `nrow - 4` (not `nrow - 5`) is needed so
# that minIndex..nrow spans exactly five rows; max() clamps it to 1 when there
# are fewer than five pathways.
minIndex <- max(1, nrow(gsea_res_df) - 4)

# `gsea_res_df` is sorted by decreasing NES, so its last rows are the most
# downregulated pathways; `original_row_num` maps them back to rows of
# `gsea_res`. tail() returns the same rows as minIndex:nrow(gsea_res_df) but
# also copes with an empty table, where 1:0 would evaluate to c(1, 0).
bottom5PathwaysIds <- tail(gsea_res_df[, "original_row_num"], 5)

# Only plot when there is at least one pathway to show.
if (length(bottom5PathwaysIds) > 0) {
  gseaplot2(gsea_res, geneSetID = bottom5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
} else {
  print("There is no GSEA plot here because no enriched pathways were found")
}
Volcano plot (Average NES & adjusted p value)
# Volcano plot of the downregulated pathways: NES against -log10(adjusted
# p-value), with each point labelled by its row name.
ggplot(
  as.data.frame(gsea_res_df_short.down),
  aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.down))
) +
  geom_point() +
  theme_minimal() +
  scale_color_manual(values = c("black", "blue", "red")) +
  geom_text_repel() +
  # 1.301 is approximately -log10(0.05).
  geom_hline(yintercept = 1.301) +
  # Vertical reference lines at NES values of +1.2 and -1.2.
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  xlim(-10, 10)
Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Use Revigo to cluster upregulated pathways
# Clustering input: the adjusted p-values of the upregulated pathways, keyed by
# the row names of `gsea_res_df_short.up`.
revigo_input.cellline.up <- gsea_res_df_short.up["p.adjust"]
rownames(revigo_input.cellline.up) <- rownames(gsea_res_df_short.up)

# Pairwise similarity between those terms (BP ontology, "Rel" method).
simMatrix <- calculateSimMatrix(
  rownames(revigo_input.cellline.up),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
Warning in calculateSimMatrix(rownames(revigo_input.cellline.up), orgdb = "org.Hs.eg.db", :
Removed 1 terms that were not found in orgdb for BP
# Score each term by -log10(adjusted p-value), named by term.
scores <- setNames(
  -log10(revigo_input.cellline.up$p.adjust),
  rownames(revigo_input.cellline.up)
)

# With at most one upregulated pathway there is nothing to cluster: fall back
# to an empty data frame so the plotting guards further down skip their plots.
if (nrow(revigo_input.cellline.up) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble indicates the representative (chosen mostly by p-value) of a cluster of terms. The size of the bubble indicates the generality of the term (a larger bubble means a more general term).
# Scatter plot of the reduced upregulated terms; drawn only when more than two
# reduced terms are available.
# NOTE(review): revigo_scatterplot() is not defined in this section —
# presumably a project helper; confirm its expected inputs.
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
# Static heatmap of the term similarity matrix for the upregulated pathways,
# annotated with each term's parent term; drawn only when more than two
# reduced terms are available.
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
# Interactive version of the upregulated-pathway heatmap; drawn only when more
# than two reduced terms are available.
# NOTE(review): revigo_heatmap() is not defined in this section — presumably a
# project helper; confirm its expected inputs.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
# Treemap of the reduced upregulated terms; drawn only when more than two
# reduced terms are available.
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}
Use Revigo to cluster downregulated pathways
# Clustering input: the adjusted p-values of the downregulated pathways, keyed
# by the row names of `gsea_res_df_short.down`.
revigo_input.cellline.down <- gsea_res_df_short.down["p.adjust"]
rownames(revigo_input.cellline.down) <- rownames(gsea_res_df_short.down)

# Pairwise similarity between those terms (BP ontology, "Rel" method).
# This overwrites the `simMatrix` computed for the upregulated pathways.
simMatrix <- calculateSimMatrix(
  rownames(revigo_input.cellline.down),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
# Score each term by -log10(adjusted p-value), named by term.
scores <- setNames(
  -log10(revigo_input.cellline.down$p.adjust),
  rownames(revigo_input.cellline.down)
)

# With at most one downregulated pathway there is nothing to cluster: fall back
# to an empty data frame so the plotting guards further down skip their plots.
if (nrow(revigo_input.cellline.down) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly downregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble indicates the representative (chosen mostly by p-value) of a cluster of terms. The size of the bubble indicates the generality of the term (a larger bubble means a more general term).
# Scatter plot of the reduced downregulated terms; drawn only when more than
# two reduced terms are available.
# NOTE(review): revigo_scatterplot() is not defined in this section —
# presumably a project helper; confirm its expected inputs.
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
# Static heatmap of the term similarity matrix for the downregulated pathways,
# annotated with each term's parent term; drawn only when more than two
# reduced terms are available.
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
# Interactive version of the downregulated-pathway heatmap; drawn only when
# more than two reduced terms are available.
# NOTE(review): revigo_heatmap() is not defined in this section — presumably a
# project helper; confirm its expected inputs.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
# Treemap of the reduced downregulated terms; drawn only when more than two
# reduced terms are available.
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}